In [1]:
import string
from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:  # pandas >= 0.20 exposes the plotting helpers under pandas.plotting
    from pandas.plotting import scatter_matrix
except ImportError:  # older pandas only provides pandas.tools.plotting
    from pandas.tools.plotting import scatter_matrix

%matplotlib inline
matplotlib.style.use('ggplot')
In [2]:
# Read the training CSV (latin-1 encoded), print its row count and preview it.
df = pd.read_csv('data/train_data2.csv', encoding='latin-1')
print(df.shape[0])
df.head()
Out[2]:
In [3]:
# Parse the release date, then derive the calendar year and month columns from it.
df['Released'] = pd.to_datetime(df['Released'])
released_parts = df['Released'].dt
df['Year'] = released_parts.year
df['Month'] = released_parts.month
df.head()
Out[3]:
In [4]:
# Summary statistics of the release year, cast to whole numbers for display.
df.Year.describe().astype(int)
Out[4]:
In [5]:
# Number of rows per release year, as a dict and as (year, count) pairs
# ordered from the latest year down to the earliest.
import operator
yr_dict = df['Year'].value_counts().to_dict()
yr_lst = sorted(yr_dict.items(), key=operator.itemgetter(0), reverse=True)
In [6]:
# Bar chart of the number of torrents per release year, in yr_lst order.
plt.figure(figsize=(25,10))
ind = np.arange(len(yr_dict))
width = 0.35
bar_year = [year for year, count in yr_lst]
bar_count = [count for year, count in yr_lst]
# Centre each bar on its position explicitly and put the tick on that same
# position, so the year labels line up with the bars regardless of the
# matplotlib default bar alignment.
plt.bar(ind, bar_count, width, color='r', align='center')
plt.ylabel('Count')
plt.xlabel('Year')
plt.title('Number of Torrents per Year')
plt.xticks(ind, bar_year, rotation='vertical')
plt.yticks(np.arange(0, 91, 5))
plt.show()
In [7]:
# Keep only rows with yr_cut_bot <= Year < yr_cut_top and report how many rows
# the cutoff removed.
before = len(df)
yr_cut_bot = 1998
yr_cut_top = 2015
mask = (df['Year'] >= yr_cut_bot) & (df['Year'] < yr_cut_top)
# Take an explicit copy: columns are added to df_yr later, and that must not
# write into a slice of df.
df_yr = df.loc[mask].copy()
after = len(df_yr)
lost = before - after
# Share of the original rows that was dropped; guarded for an empty input frame.
pct_lost = round(100.0 * lost / before, 2) if before else 0.0
print('{0} entries lost ({1}%) due to date cutoff between {2} and {3}'.format(
    lost, pct_lost, yr_cut_bot, yr_cut_top))
In [8]:
# Pairwise scatter plots of the data set AFTER the year cutoff.
# scatter_matrix comes from the notebook's import cell, which resolves the
# right pandas module for the installed version.
plt.rcParams['figure.figsize'] = (15, 15)
_ = scatter_matrix(df_yr)
In [9]:
# Distinct raw 'Genre' strings that occur after the year cutoff.
unq_genres = df_yr['Genre'].unique().tolist()
# The same combinations, each split on commas into a list of genre names
# (whitespace around the names is left as-is here).
lst_grp_genres = [combo.split(',') for combo in unq_genres]
In [10]:
# Sorted list of every individual genre name found in the grouped strings,
# with surrounding whitespace stripped.
ind_genre = sorted({name.strip() for combo in unq_genres for name in combo.split(',')})
In [11]:
# dictionary - number of rows in which each individual genre occurs
# Rows are matched on exact comma-separated genre names. A substring/regex
# match (str.contains) would also count e.g. "Musical" rows towards "Music".
count = defaultdict(lambda:0)
for genre in ind_genre:
    count[genre] = 0
for entry in df_yr.Genre.dropna():
    # set(): a genre repeated within one row still counts that row once
    for token in set(part.strip() for part in entry.split(',')):
        count[token] += 1
import operator
# (genre, count) pairs, highest count first
srt = sorted(count.items(), key=operator.itemgetter(1))
srt = srt[::-1]
In [12]:
def split_to_array(ser):
    """Split one genre string into a Series of its space-separated tokens.

    The string is stripped, every comma is removed, and the remainder is
    split on single spaces; the pieces are returned as a ``pd.Series``.
    """
    tokens = ser.strip().replace(',', '').split(' ')
    return pd.Series(np.array(tokens))
# Frequency of every individual genre token across df_yr, most frequent first.
# Applying split_to_array yields one column per token position; the frame is
# flattened, the padding NaNs are dropped, and the tokens are tallied.
genres = (
    pd.Series(df_yr.Genre.apply(split_to_array).values.ravel())
    .dropna()
    .value_counts()
    .sort_values(ascending=False)
)
In [13]:
def convert_frequency(ser, genres=genres):
    """Return the most frequent genre among those listed in ``ser``.

    ``ser`` is one raw genre string; it is tokenised the same way as in
    ``split_to_array`` (strip, drop commas, split on spaces).  ``genres`` is
    the genre -> frequency Series; the token with the highest frequency is
    returned as a label.
    """
    split_array = np.array(ser.strip().replace(',','').split(' '))
    # idxmax() returns the index LABEL (the genre name).  argmax() returns an
    # integer position on current pandas, which would store numbers instead of
    # genre names.  reindex() yields NaN for a token missing from ``genres``
    # instead of raising, and idxmax() skips NaN.
    return genres.reindex(split_array).idxmax()

# assign() builds a new frame, so no column is written into a slice of df.
df_yr = df_yr.assign(Genre_Single=df_yr.Genre.apply(convert_frequency))
In [14]:
# Restrict the data to the genres and the ratings of interest, then
# summarise the numeric columns of the resulting subset.
genre = ['Action', 'Adventure', 'Comedy', 'Drama']
ratings = ['PG-13', 'PG', 'G', 'R']
keep_rows = df_yr['Genre_Single'].isin(genre) & df_yr['Rated'].isin(ratings)
df_sub = df_yr.loc[keep_rows]
df_sub.describe()
Out[14]:
In [15]:
# Pairwise scatter plots of the filtered subset.
# scatter_matrix comes from the notebook's import cell, which resolves the
# right pandas module for the installed version.
plt.rcParams['figure.figsize'] = (15, 15)
_ = scatter_matrix(df_sub)
In [16]:
from patsy import dmatrices
# Response is Total_Torrents; predictors are budget, year and the single genre.
patsy_formula = 'Total_Torrents ~ Prod_Budget + Year + Genre_Single'
# dmatrices returns (response, predictors); both are requested as DataFrames.
y, x = dmatrices(formula_like=patsy_formula, data=df_sub, return_type='dataframe')
In [17]:
import statsmodels.api as sm
# Ordinary least squares of y on x; the fit summary is the cell output.
model = sm.OLS(endog=y, exog=x)
results = model.fit()
results.summary()
Out[17]:
In [18]:
from sklearn.linear_model import LinearRegression
# Fit on the full design matrix and keep the in-sample score and coefficients.
model = LinearRegression().fit(x, y)
mod_lr_score, mod_lr_coef = model.score(x, y), model.coef_
In [19]:
# Hold-out validation: fit on 80% of the rows, evaluate on the remaining 20%.
try:  # scikit-learn >= 0.18 provides train_test_split in model_selection
    from sklearn import model_selection as cv
except ImportError:  # legacy scikit-learn
    from sklearn import cross_validation as cv
from sklearn import metrics
x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)
model = LinearRegression().fit(x_train, y_train)
# store results
# Both metrics are taken on the held-out rows; scoring the training rows
# would only repeat the in-sample fit quality.
mean_sq_err = metrics.mean_squared_error(y_test,model.predict(x_test))
cv_mod_score = model.score(x_test, y_test)
In [20]:
# reset x, y otherwise errors occur
y, x = dmatrices(patsy_formula, data=df_sub, return_type='dataframe')
# 10-fold cross-validation.  Folds are sized from x (the rows that are actually
# indexed with .iloc below) and the shuffle is seeded so the result is repeatable.
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=10, shuffle=True, random_state=1234).split(x)
except ImportError:  # legacy scikit-learn
    from sklearn.cross_validation import KFold
    kf = KFold(len(x), n_folds=10, shuffle=True, random_state=1234)
fold_mse, fold_r2 = [], []
for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x_train, y_train)
    # Evaluate the model fitted on THIS fold (clf2) on the rows it has not seen.
    fold_mse.append(metrics.mean_squared_error(y_test, clf2.predict(x_test)))
    fold_r2.append(clf2.score(x_test, y_test))
# store results: averages over all folds
mean_sq_errKf = np.mean(fold_mse)
cvKf_mod_score = np.mean(fold_r2)
In [21]:
# Report the metrics collected for the untransformed model, one per line.
for label, value in [
    ('Model Linear Regression Score = {0}', mod_lr_score),
    (' Mean Square Error = {0}', mean_sq_err),
    (' Cross Validation Model Score = {0}', cv_mod_score),
    (' Mean Squred Error K-Fold = {0}', mean_sq_errKf),
    ('Cross Val. K-Fold Model Score = {0}', cvKf_mod_score),
]:
    print(label.format(value))
In [22]:
# Actual response against the model's prediction for the same rows, as red dots.
predicted = model.predict(x)
_ = plt.plot(y, predicted, 'ro')
In [23]:
# Pairwise scatter plots of the filtered subset (repeated before transforming).
# scatter_matrix comes from the notebook's import cell, which resolves the
# right pandas module for the installed version.
plt.rcParams['figure.figsize'] = (15, 15)
_ = scatter_matrix(df_sub)
In [24]:
# Display the column labels of df.
df.columns
Out[24]:
In [25]:
# Add natural-log versions of the budget and the torrent count.  assign()
# returns a new frame, so nothing is written into the filtered slice that
# df_sub was taken from.
# NOTE(review): np.log gives -inf for zeros; this assumes both columns are
# strictly positive -- confirm against the data.
df_sub = df_sub.assign(
    log_budg=np.log(df_sub.Prod_Budget),
    log_tor=np.log(df_sub.Total_Torrents),
)
trans = df_sub[['log_budg', 'Year', 'log_tor']]
plt.rcParams['figure.figsize'] = (15, 15)
# scatter_matrix comes from the notebook's import cell.
_ = scatter_matrix(trans)
In [ ]:
In [26]:
# Same model as before, with the log-transformed response and budget.
log_patsy_formula = 'log_tor ~ log_budg + Year + Genre_Single'
y, x = dmatrices(formula_like=log_patsy_formula, data=df_sub, return_type='dataframe')
In [27]:
import statsmodels.formula.api as smf
# Formula-interface OLS on the log model; the fit summary is the cell output.
log_ols = smf.ols(formula=log_patsy_formula, data=df_sub)
results = log_ols.fit()
results.summary()
Out[27]:
In [28]:
from sklearn.linear_model import LinearRegression
# Fit the log model on the full design matrix and keep its in-sample score.
model = LinearRegression().fit(x, y)
# store results
log_mod_lr_score = model.score(x, y)
In [29]:
# Hold-out validation of the log model: fit on 80% of the rows, evaluate on
# the remaining 20%.
try:  # scikit-learn >= 0.18 provides train_test_split in model_selection
    from sklearn import model_selection as cv
except ImportError:  # legacy scikit-learn
    from sklearn import cross_validation as cv
from sklearn import metrics
x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)
model = LinearRegression().fit(x_train, y_train)
# store results
# Both metrics are taken on the held-out rows; scoring the training rows
# would only repeat the in-sample fit quality.
log_mean_sq_err = metrics.mean_squared_error(y_test,model.predict(x_test))
log_cv_mod_score = model.score(x_test, y_test)
In [30]:
# reset x, y otherwise errors occur
y, x = dmatrices(log_patsy_formula, data=df_sub, return_type='dataframe')
# 10-fold cross-validation.  Folds are sized from x (the rows that are actually
# indexed with .iloc below) and the shuffle is seeded so the result is repeatable.
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=10, shuffle=True, random_state=1234).split(x)
except ImportError:  # legacy scikit-learn
    from sklearn.cross_validation import KFold
    kf = KFold(len(x), n_folds=10, shuffle=True, random_state=1234)
fold_mse, fold_r2 = [], []
for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x_train, y_train)
    # Evaluate the model fitted on THIS fold (clf2) on the rows it has not seen.
    fold_mse.append(metrics.mean_squared_error(y_test, clf2.predict(x_test)))
    fold_r2.append(clf2.score(x_test, y_test))
# store results: averages over all folds
log_mean_sq_errKf = np.mean(fold_mse)
log_cvKf_mod_score = np.mean(fold_r2)
In [31]:
# Report the metrics collected for the log-transformed model, one per line.
for label, value in [
    ('Log Model Linear Regression Score = {0}', log_mod_lr_score),
    (' Log Mean Square Error = {0}', log_mean_sq_err),
    (' Log Cross Validation Model Score = {0}', log_cv_mod_score),
    (' Log Mean Squred Error K-Fold = {0}', log_mean_sq_errKf),
    ('Log Cross Val. K-Fold Model Score = {0}', log_cvKf_mod_score),
]:
    print(label.format(value))
In [32]:
# Read the test CSV and add natural-log versions of budget, runtime and
# torrent count under the log_* column names.
df_TEST = pd.read_csv('data/test_data2.csv', encoding='latin-1')
for source_col, log_col in [
    ('Prod_Budget', 'log_budg'),
    ('Runtime', 'log_run'),
    ('Total_Torrents', 'log_tor'),
]:
    df_TEST[log_col] = np.log(df_TEST[source_col])
def split_to_array(ser):
    """Split one genre string into a Series of its space-separated tokens.

    The string is stripped, every comma is removed, and the remainder is
    split on single spaces; the pieces are returned as a ``pd.Series``.
    """
    pieces = ser.strip().replace(',', '').split(' ')
    return pd.Series(np.array(pieces))
# Genre frequencies are taken from the TRAINING data (df_yr), most frequent first.
genres = df_yr.Genre.apply(split_to_array)
genres = pd.Series(genres.values.ravel()).dropna()
genres = genres.value_counts().sort_values(ascending=False)
def convert_frequency(ser, genres=genres):
    """Return the most frequent genre among those listed in ``ser``.

    ``ser`` is one raw genre string; it is tokenised the same way as in
    ``split_to_array`` (strip, drop commas, split on spaces).  ``genres`` is
    the genre -> frequency Series; the token with the highest frequency is
    returned as a label.
    """
    split_array = np.array(ser.strip().replace(',','').split(' '))
    # idxmax() returns the index LABEL (the genre name).  argmax() returns an
    # integer position on current pandas, which would store numbers instead of
    # genre names.  reindex() yields NaN for a token missing from ``genres``
    # (possible for test rows) instead of raising, and idxmax() skips NaN.
    return genres.reindex(split_array).idxmax()
df_TEST['Genre_Single'] = df_TEST.Genre.apply(convert_frequency)
# Use the same terms the models were fitted on; they were never trained with Month.
log_patsy_formula_test = 'log_tor ~ log_budg + Year + Genre_Single'
y, x = dmatrices(log_patsy_formula_test, data=df_TEST, return_type='dataframe')
# Align the test design matrix with the training columns (x_train is left over
# from the last K-fold split of the log model).  Genre indicator columns absent
# from the test data become 0; columns the models never saw are dropped.
# NOTE(review): test rows whose genre was not in the training subset are then
# scored as the baseline genre -- consider filtering df_TEST like df_sub.
x = x.reindex(columns=x_train.columns, fill_value=0)
# Score on the test set just built, not on x_test/y_test, which are still the
# last cross-validation fold of the training data.
print(clf2.score(x, y))
print(metrics.mean_squared_error(y,model.predict(x)))
In [33]:
#_ = plt.plot(y, model.predict(x), 'ro')
In [ ]: